{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.017446,
     "end_time": "2021-01-20T00:11:08.026346",
     "exception": false,
     "start_time": "2021-01-20T00:11:08.008900",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Encoder + MLP + CVTuner\n",
    "The idea of using an encoder is the denoise the data. After many attempts at using a unsupervised autoencoder, the choice landed on a bottleneck encoder as this will preserve the intra-feature relations. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-08T06:47:54.236821Z",
     "start_time": "2021-01-08T06:47:54.227568Z"
    },
    "execution": {
     "iopub.execute_input": "2021-01-20T00:11:08.069817Z",
     "iopub.status.busy": "2021-01-20T00:11:08.068925Z",
     "iopub.status.idle": "2021-01-20T00:11:15.454267Z",
     "shell.execute_reply": "2021-01-20T00:11:15.453013Z"
    },
    "papermill": {
     "duration": 7.411363,
     "end_time": "2021-01-20T00:11:15.454401",
     "exception": false,
     "start_time": "2021-01-20T00:11:08.043038",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation\n",
    "from tensorflow.keras.models import Model, Sequential\n",
    "from tensorflow.keras.losses import BinaryCrossentropy\n",
    "from tensorflow.keras.optimizers import Adam\n",
    "from tensorflow.keras.callbacks import EarlyStopping\n",
    "from tensorflow.keras.layers.experimental.preprocessing import Normalization\n",
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import GroupKFold\n",
    "\n",
    "from tqdm import tqdm\n",
    "from random import choices\n",
    "\n",
    "import kerastuner as kt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.015807,
     "end_time": "2021-01-20T00:11:15.487291",
     "exception": false,
     "start_time": "2021-01-20T00:11:15.471484",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# PurgedGroupTimeSeriesSplit——根据时序划分数据集\n",
    "Click the code button to see. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-01-20T00:11:15.533417Z",
     "iopub.status.busy": "2021-01-20T00:11:15.526897Z",
     "iopub.status.idle": "2021-01-20T00:11:15.536119Z",
     "shell.execute_reply": "2021-01-20T00:11:15.535460Z"
    },
    "papermill": {
     "duration": 0.03276,
     "end_time": "2021-01-20T00:11:15.536217",
     "exception": false,
     "start_time": "2021-01-20T00:11:15.503457",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "class CVTuner(kt.engine.tuner.Tuner):\n",
    "    def run_trial(self, trial, X, y, splits, batch_size=32, epochs=1,callbacks=None):\n",
    "        val_losses = []\n",
    "        for train_indices, test_indices in splits:\n",
    "            X_train, X_test = [x[train_indices] for x in X], [x[test_indices] for x in X]\n",
    "            y_train, y_test = [a[train_indices] for a in y], [a[test_indices] for a in y]\n",
    "            if len(X_train) < 2:\n",
    "                X_train = X_train[0]\n",
    "                X_test = X_test[0]\n",
    "            if len(y_train) < 2:\n",
    "                y_train = y_train[0]\n",
    "                y_test = y_test[0]\n",
    "            \n",
    "            model = self.hypermodel.build(trial.hyperparameters)\n",
    "            hist = model.fit(X_train,y_train,\n",
    "                      validation_data=(X_test,y_test),\n",
    "                      epochs=epochs,\n",
    "                        batch_size=batch_size,\n",
    "                      callbacks=callbacks)\n",
    "            \n",
    "            val_losses.append([hist.history[k][-1] for k in hist.history])\n",
    "        val_losses = np.asarray(val_losses)\n",
    "        self.oracle.update_trial(trial.trial_id, {k:np.mean(val_losses[:,i]) for i,k in enumerate(hist.history.keys())})\n",
    "        self.save_model(trial.trial_id, model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-08T06:47:56.000272Z",
     "start_time": "2021-01-08T06:47:55.969097Z"
    },
    "_kg_hide-input": true,
    "_kg_hide-output": true,
    "execution": {
     "iopub.execute_input": "2021-01-20T00:11:15.593200Z",
     "iopub.status.busy": "2021-01-20T00:11:15.582300Z",
     "iopub.status.idle": "2021-01-20T00:11:15.600596Z",
     "shell.execute_reply": "2021-01-20T00:11:15.600090Z"
    },
    "papermill": {
     "duration": 0.048069,
     "end_time": "2021-01-20T00:11:15.600701",
     "exception": false,
     "start_time": "2021-01-20T00:11:15.552632",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples\n",
    "from sklearn.utils.validation import _deprecate_positional_args\n",
    "\n",
    "# modified code for group gaps; source\n",
    "# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243\n",
    "class PurgedGroupTimeSeriesSplit(_BaseKFold):\n",
    "    \"\"\"Time Series cross-validator variant with non-overlapping groups.\n",
    "    Allows for a gap in groups to avoid potentially leaking info from\n",
    "    train into test if the model has windowed or lag features.\n",
    "    Provides train/test indices to split time series data samples\n",
    "    that are observed at fixed time intervals according to a\n",
    "    third-party provided group.\n",
    "    In each split, test indices must be higher than before, and thus shuffling\n",
    "    in cross validator is inappropriate.\n",
    "    This cross-validation object is a variation of :class:`KFold`.\n",
    "    In the kth split, it returns first k folds as train set and the\n",
    "    (k+1)th fold as test set.\n",
    "    The same group will not appear in two different folds (the number of\n",
    "    distinct groups has to be at least equal to the number of folds).\n",
    "    Note that unlike standard cross-validation methods, successive\n",
    "    training sets are supersets of those that come before them.\n",
    "    Read more in the :ref:`User Guide <cross_validation>`.\n",
    "    Parameters\n",
    "    ----------\n",
    "    n_splits : int, default=5\n",
    "        Number of splits. Must be at least 2.\n",
    "    max_train_group_size : int, default=Inf\n",
    "        Maximum group size for a single training set.\n",
    "    group_gap : int, default=None\n",
    "        Gap between train and test\n",
    "    max_test_group_size : int, default=Inf\n",
    "        We discard this number of groups from the end of each train split\n",
    "    \"\"\"\n",
    "\n",
    "    @_deprecate_positional_args\n",
    "    def __init__(self,\n",
    "                 n_splits=5,\n",
    "                 *,\n",
    "                 max_train_group_size=np.inf,\n",
    "                 max_test_group_size=np.inf,\n",
    "                 group_gap=None,\n",
    "                 verbose=False\n",
    "                 ):\n",
    "        super().__init__(n_splits, shuffle=False, random_state=None)\n",
    "        self.max_train_group_size = max_train_group_size\n",
    "        self.group_gap = group_gap\n",
    "        self.max_test_group_size = max_test_group_size\n",
    "        self.verbose = verbose\n",
    "\n",
    "    def split(self, X, y=None, groups=None):\n",
    "        \"\"\"Generate indices to split data into training and test set.\n",
    "        Parameters\n",
    "        ----------\n",
    "        X : array-like of shape (n_samples, n_features)\n",
    "            Training data, where n_samples is the number of samples\n",
    "            and n_features is the number of features.\n",
    "        y : array-like of shape (n_samples,)\n",
    "            Always ignored, exists for compatibility.\n",
    "        groups : array-like of shape (n_samples,)\n",
    "            Group labels for the samples used while splitting the dataset into\n",
    "            train/test set.\n",
    "        Yields\n",
    "        ------\n",
    "        train : ndarray\n",
    "            The training set indices for that split.\n",
    "        test : ndarray\n",
    "            The testing set indices for that split.\n",
    "        \"\"\"\n",
    "        if groups is None:\n",
    "            raise ValueError(\n",
    "                \"The 'groups' parameter should not be None\")\n",
    "        X, y, groups = indexable(X, y, groups)\n",
    "        n_samples = _num_samples(X)\n",
    "        n_splits = self.n_splits\n",
    "        group_gap = self.group_gap\n",
    "        max_test_group_size = self.max_test_group_size\n",
    "        max_train_group_size = self.max_train_group_size\n",
    "        n_folds = n_splits + 1\n",
    "        group_dict = {}\n",
    "        u, ind = np.unique(groups, return_index=True)\n",
    "        unique_groups = u[np.argsort(ind)]\n",
    "        n_samples = _num_samples(X)\n",
    "        n_groups = _num_samples(unique_groups)\n",
    "        for idx in np.arange(n_samples):\n",
    "            if (groups[idx] in group_dict):\n",
    "                group_dict[groups[idx]].append(idx)\n",
    "            else:\n",
    "                group_dict[groups[idx]] = [idx]\n",
    "        if n_folds > n_groups:\n",
    "            raise ValueError(\n",
    "                (\"Cannot have number of folds={0} greater than\"\n",
    "                 \" the number of groups={1}\").format(n_folds,\n",
    "                                                     n_groups))\n",
    "\n",
    "        group_test_size = min(n_groups // n_folds, max_test_group_size)\n",
    "        group_test_starts = range(n_groups - n_splits * group_test_size,\n",
    "                                  n_groups, group_test_size)\n",
    "        for group_test_start in group_test_starts:\n",
    "            train_array = []\n",
    "            test_array = []\n",
    "\n",
    "            group_st = max(0, group_test_start - group_gap - max_train_group_size)\n",
    "            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:\n",
    "                train_array_tmp = group_dict[train_group_idx]\n",
    "                \n",
    "                train_array = np.sort(np.unique(\n",
    "                                      np.concatenate((train_array,\n",
    "                                                      train_array_tmp)),\n",
    "                                      axis=None), axis=None)\n",
    "\n",
    "            train_end = train_array.size\n",
    " \n",
    "            for test_group_idx in unique_groups[group_test_start:\n",
    "                                                group_test_start +\n",
    "                                                group_test_size]:\n",
    "                test_array_tmp = group_dict[test_group_idx]\n",
    "                test_array = np.sort(np.unique(\n",
    "                                              np.concatenate((test_array,\n",
    "                                                              test_array_tmp)),\n",
    "                                     axis=None), axis=None)\n",
    "\n",
    "            test_array  = test_array[group_gap:]\n",
    "            \n",
    "            \n",
    "            if self.verbose > 0:\n",
    "                    pass\n",
    "                    \n",
    "            yield [int(i) for i in train_array], [int(i) for i in test_array]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.015832,
     "end_time": "2021-01-20T00:11:15.632865",
     "exception": false,
     "start_time": "2021-01-20T00:11:15.617033",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### Loading the training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-08T06:50:51.883506Z",
     "start_time": "2021-01-08T06:49:46.876468Z"
    },
    "execution": {
     "iopub.execute_input": "2021-01-20T00:11:15.678588Z",
     "iopub.status.busy": "2021-01-20T00:11:15.677748Z",
     "iopub.status.idle": "2021-01-20T00:12:33.539274Z",
     "shell.execute_reply": "2021-01-20T00:12:33.538621Z"
    },
    "papermill": {
     "duration": 77.890435,
     "end_time": "2021-01-20T00:12:33.539412",
     "exception": false,
     "start_time": "2021-01-20T00:11:15.648977",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style type='text/css'>\n",
       ".datatable table.frame { margin-bottom: 0; }\n",
       ".datatable table.frame thead { border-bottom: none; }\n",
       ".datatable table.frame tr.coltypes td {  color: #FFFFFF;  line-height: 6px;  padding: 0 0.5em;}\n",
       ".datatable .bool    { background: #DDDD99; }\n",
       ".datatable .object  { background: #565656; }\n",
       ".datatable .int     { background: #5D9E5D; }\n",
       ".datatable .float   { background: #4040CC; }\n",
       ".datatable .str     { background: #CC4040; }\n",
       ".datatable .row_index {  background: var(--jp-border-color3);  border-right: 1px solid var(--jp-border-color0);  color: var(--jp-ui-font-color3);  font-size: 9px;}\n",
       ".datatable .frame tr.coltypes .row_index {  background: var(--jp-border-color0);}\n",
       ".datatable th:nth-child(2) { padding-left: 12px; }\n",
       ".datatable .hellipsis {  color: var(--jp-cell-editor-border-color);}\n",
       ".datatable .vellipsis {  background: var(--jp-layout-color0);  color: var(--jp-cell-editor-border-color);}\n",
       ".datatable .na {  color: var(--jp-cell-editor-border-color);  font-size: 80%;}\n",
       ".datatable .footer { font-size: 9px; }\n",
       ".datatable .frame_dimensions {  background: var(--jp-border-color3);  border-top: 1px solid var(--jp-border-color0);  color: var(--jp-ui-font-color3);  display: inline-block;  opacity: 0.6;  padding: 1px 10px 1px 5px;}\n",
       "</style>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data ready!\n"
     ]
    }
   ],
   "source": [
    "# 加载训练数据\n",
    "# 定义TRAINING来控制到底是训练还是提交，训练时TRAINING = True，预测时TRAINING = False\n",
    "TRAINING = False \n",
    "USE_FINETUNE = False     \n",
    "FOLDS = 4\n",
    "SEED = 42\n",
    "# 读取数据,并用一部分数据集当我们的训练集\n",
    "import datatable as dt\n",
    "train = dt.fread('../input/jane-street-market-prediction/train.csv').to_pandas()\n",
    "#train = pd.read_csv('../input/jane-street-market-prediction/train.csv')\n",
    "print('data ready!')\n",
    "# 先用查询表达式'date > 85'进行查询，再重置为整数索引\n",
    "train = train.query('date > 85').reset_index(drop = True) \n",
    "# 将float64 => float32,缩小内存\n",
    "train = train.astype({c: np.float32 for c in train.select_dtypes(include='float64').columns}) #limit memory use\n",
    "# 缺失值的填充\n",
    "train.fillna(train.mean(),inplace=True)\n",
    "# 选取满足条件的训练集\n",
    "train = train.query('weight > 0').reset_index(drop = True)\n",
    "# 构建action列\n",
    "#train['action'] = (train['resp'] > 0).astype('int')\n",
    "train['action'] =  (  (train['resp_1'] > 0 ) & (train['resp_2'] > 0 ) & (train['resp_3'] > 0 ) & (train['resp_4'] > 0 ) &  (train['resp'] > 0 )   ).astype('int')\n",
    "# 130个feature\n",
    "features = [c for c in train.columns if 'feature' in c]\n",
    "\n",
    "resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']\n",
    "\n",
    "X = train[features].values\n",
    "y = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T #Multitarget\n",
    "# 每列的均值\n",
    "f_mean = np.mean(train[features[1:]].values,axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.017523,
     "end_time": "2021-01-20T00:12:33.573933",
     "exception": false,
     "start_time": "2021-01-20T00:12:33.556410",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### Creating the autoencoder. \n",
    "The autoencoder should aid in denoising the data. Based on [this](https://www.semanticscholar.org/paper/Deep-Bottleneck-Classifiers-in-Supervised-Dimension-Parviainen/fb86483f7573f6430fe4597432b0cd3e34b16e43) paper. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.017378,
     "end_time": "2021-01-20T00:12:33.608975",
     "exception": false,
     "start_time": "2021-01-20T00:12:33.591597",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### 自编码器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-08T07:06:18.567941Z",
     "start_time": "2021-01-08T07:06:18.553402Z"
    },
    "execution": {
     "iopub.execute_input": "2021-01-20T00:12:33.658516Z",
     "iopub.status.busy": "2021-01-20T00:12:33.656637Z",
     "iopub.status.idle": "2021-01-20T00:12:33.659246Z",
     "shell.execute_reply": "2021-01-20T00:12:33.659735Z"
    },
    "papermill": {
     "duration": 0.033836,
     "end_time": "2021-01-20T00:12:33.659856",
     "exception": false,
     "start_time": "2021-01-20T00:12:33.626020",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def create_autoencoder(input_dim,output_dim,noise=0.05):\n",
    "    i = Input(input_dim)\n",
    "    # 自编码部分\n",
    "    # 自编码器— x = decoder(encoder(x)) => 130 -> 64 -> 64 -> 130\n",
    "    # 编码器—对数据进行降维\n",
    "    encoded = BatchNormalization()(i)\n",
    "    encoded = GaussianNoise(noise)(encoded)\n",
    "    encoded = Dense(64,activation='relu')(encoded)\n",
    "    # 解码器\n",
    "    # 对数据进行升维\n",
    "    decoded = Dropout(0.2)(encoded)\n",
    "    decoded = Dense(input_dim,name='decoded')(decoded)\n",
    "    \n",
    "    # 将解码后的数据再训练一个分类模型\n",
    "    x = Dense(32,activation='relu')(decoded)\n",
    "    x = BatchNormalization()(x)\n",
    "    x = Dropout(0.2)(x)\n",
    "    #x = Dense(32,activation='relu')(x)\n",
    "    #x = BatchNormalization()(x)\n",
    "    #x = Dropout(0.2)(x)    \n",
    "    x = Dense(output_dim,activation='sigmoid',name='label_output')(x)\n",
    "    \n",
    "    encoder = Model(inputs=i,outputs=encoded)\n",
    "    autoencoder = Model(inputs=i,outputs=[decoded,x])\n",
    "    # 损失函数由二部分构成。损失：均方误差。分类：交叉熵损失\n",
    "    autoencoder.compile(optimizer=Adam(0.001),loss={'decoded':'mse','label_output':'binary_crossentropy'})\n",
    "    return autoencoder, encoder"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.017015,
     "end_time": "2021-01-20T00:12:33.694569",
     "exception": false,
     "start_time": "2021-01-20T00:12:33.677554",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### Creating the MLP（全连接网络）. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-08T07:02:28.261279Z",
     "start_time": "2021-01-08T07:02:28.248517Z"
    },
    "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
    "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
    "execution": {
     "iopub.execute_input": "2021-01-20T00:12:33.734122Z",
     "iopub.status.busy": "2021-01-20T00:12:33.733402Z",
     "iopub.status.idle": "2021-01-20T00:12:33.738929Z",
     "shell.execute_reply": "2021-01-20T00:12:33.738444Z"
    },
    "papermill": {
     "duration": 0.027531,
     "end_time": "2021-01-20T00:12:33.739037",
     "exception": false,
     "start_time": "2021-01-20T00:12:33.711506",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"\\ndef create_model(input_dim,output_dim,encoder):\\n    inputs = Input(input_dim)\\n    # encoder进行降维，可以学习到数据集更有效的表征方法\\n    x = encoder(inputs)\\n    x = Concatenate()([x,inputs]) #use both raw and encoded features\\n    x = BatchNormalization()(x)\\n    x = Dropout(0.13)(x)\\n    # 多个隐藏层\\n    hidden_units = [384, 896, 896, 394]\\n    for idx, hidden_unit in enumerate(hidden_units):\\n        x = Dense(hidden_unit)(x)\\n        x = BatchNormalization()(x)\\n        x = Lambda(tf.keras.activations.relu)(x)\\n        x = Dropout(0.25)(x)\\n    # 输出\\n    x = Dense(output_dim,activation='sigmoid')(x)\\n    model = Model(inputs=inputs,outputs=x)\\n    # label_smoothing标签平滑操作\\n    model.compile(optimizer=Adam(0.0005),loss=BinaryCrossentropy(label_smoothing=0.05),\\n                  metrics=[tf.keras.metrics.AUC(name = 'auc')])\\n    return model\\n\""
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''\n",
    "def create_model(input_dim,output_dim,encoder):\n",
    "    inputs = Input(input_dim)\n",
    "    # encoder进行降维，可以学习到数据集更有效的表征方法\n",
    "    x = encoder(inputs)\n",
    "    x = Concatenate()([x,inputs]) #use both raw and encoded features\n",
    "    x = BatchNormalization()(x)\n",
    "    x = Dropout(0.13)(x)\n",
    "    # 多个隐藏层\n",
    "    hidden_units = [384, 896, 896, 394]\n",
    "    for idx, hidden_unit in enumerate(hidden_units):\n",
    "        x = Dense(hidden_unit)(x)\n",
    "        x = BatchNormalization()(x)\n",
    "        x = Lambda(tf.keras.activations.relu)(x)\n",
    "        x = Dropout(0.25)(x)\n",
    "    # 输出\n",
    "    x = Dense(output_dim,activation='sigmoid')(x)\n",
    "    model = Model(inputs=inputs,outputs=x)\n",
    "    # label_smoothing标签平滑操作\n",
    "    model.compile(optimizer=Adam(0.0005),loss=BinaryCrossentropy(label_smoothing=0.05),\n",
    "                  metrics=[tf.keras.metrics.AUC(name = 'auc')])\n",
    "    return model\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-01-20T00:12:33.820613Z",
     "iopub.status.busy": "2021-01-20T00:12:33.819687Z",
     "iopub.status.idle": "2021-01-20T00:12:33.823054Z",
     "shell.execute_reply": "2021-01-20T00:12:33.822479Z"
    },
    "papermill": {
     "duration": 0.06702,
     "end_time": "2021-01-20T00:12:33.823149",
     "exception": false,
     "start_time": "2021-01-20T00:12:33.756129",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "#另一个方法,自动搜索\n",
    "def create_model(hp,input_dim,output_dim,encoder):\n",
    "    inputs = Input(input_dim)\n",
    "    \n",
    "    x = encoder(inputs)\n",
    "    x = Concatenate()([x,inputs]) #use both raw and encoded features\n",
    "    x = BatchNormalization()(x)\n",
    "    x = Dropout(hp.Float('init_dropout',0.0,0.5))(x)\n",
    "    \n",
    "    for i in range(hp.Int('num_layers',1,3)):\n",
    "        x = Dense(hp.Int('num_units_{i}',64,256))(x)\n",
    "        x = BatchNormalization()(x)\n",
    "        x = Lambda(tf.keras.activations.swish)(x)\n",
    "        x = Dropout(hp.Float(f'dropout_{i}',0.0,0.5))(x)\n",
    "    x = Dense(output_dim,activation='sigmoid')(x)\n",
    "    model = Model(inputs=inputs,outputs=x)\n",
    "    model.compile(optimizer=Adam(hp.Float('lr',0.00001,0.1,default=0.001)),\n",
    "                  loss=BinaryCrossentropy(label_smoothing=hp.Float('label_smoothing',0.0,0.1)),\n",
    "                  metrics=[tf.keras.metrics.AUC(name = 'auc')])\n",
    "    return model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.017496,
     "end_time": "2021-01-20T00:12:33.858816",
     "exception": false,
     "start_time": "2021-01-20T00:12:33.841320",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### Defining and training the autoencoder. \n",
    "We add gaussian noise with mean and std from training data. After training we lock the layers in the encoder from further training. \n",
    "定义与训练自编码器,我们对训练数据的均值与方差添加了高斯噪声；训练结束后，我们锁定编码器中的层，避免进一步训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-08T06:53:26.369557Z",
     "start_time": "2021-01-08T06:51:23.43509Z"
    },
    "execution": {
     "iopub.execute_input": "2021-01-20T00:12:33.903398Z",
     "iopub.status.busy": "2021-01-20T00:12:33.902675Z",
     "iopub.status.idle": "2021-01-20T00:12:37.297255Z",
     "shell.execute_reply": "2021-01-20T00:12:37.296069Z"
    },
    "papermill": {
     "duration": 3.41984,
     "end_time": "2021-01-20T00:12:37.297379",
     "exception": false,
     "start_time": "2021-01-20T00:12:33.877539",
     "status": "completed"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 753 ms, sys: 748 ms, total: 1.5 s\n",
      "Wall time: 3.39 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "autoencoder, encoder = create_autoencoder(X.shape[-1],y.shape[-1],noise=0.1)\n",
    "#如果是训练的话运行这里\n",
    "if TRAINING:\n",
    "    autoencoder.fit(X,(X,y),\n",
    "                    epochs=200,\n",
    "                    batch_size=4096, \n",
    "                    validation_split=0.1,\n",
    "                    callbacks=[EarlyStopping('val_loss',patience=10,restore_best_weights=True)])\n",
    "    encoder.save_weights('./encoder.hdf5')\n",
    "#如果是提交成绩的话运行这里\n",
    "else:\n",
    "    encoder.load_weights('../input/janestreetmodel0119/encoder.hdf5')\n",
    "encoder.trainable = False"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.018279,
     "end_time": "2021-01-20T00:12:37.335640",
     "exception": false,
     "start_time": "2021-01-20T00:12:37.317361",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "### Running CV\n",
    "Following [this notebook](https://www.kaggle.com/gogo827jz/jane-street-ffill-xgboost-purgedtimeseriescv) which use 5 PurgedGroupTimeSeriesSplit split on the dates in the training data. \n",
    "\n",
    "We add the locked encoder as the first layer of the MLP. This seems to help in speeding up the submission rather than first predicting using the encoder then using the MLP. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.01965,
     "end_time": "2021-01-20T00:12:37.374314",
     "exception": false,
     "start_time": "2021-01-20T00:12:37.354664",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "We use a Baysian Optimizer to find the optimal HPs for out model. 20 trials take about 2 hours on GPU. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-01-20T00:12:37.424823Z",
     "iopub.status.busy": "2021-01-20T00:12:37.423882Z",
     "iopub.status.idle": "2021-01-20T00:12:39.352374Z",
     "shell.execute_reply": "2021-01-20T00:12:39.351617Z"
    },
    "papermill": {
     "duration": 1.959603,
     "end_time": "2021-01-20T00:12:39.352543",
     "exception": false,
     "start_time": "2021-01-20T00:12:37.392940",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.6 s, sys: 80.1 ms, total: 1.68 s\n",
      "Wall time: 1.92 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "#训练与预测\n",
    "model_fn = lambda hp: create_model(hp,X.shape[-1],y.shape[-1],encoder)\n",
    "\n",
    "tuner = CVTuner(\n",
    "        hypermodel=model_fn,\n",
    "        oracle=kt.oracles.BayesianOptimization(\n",
    "        objective= kt.Objective('val_auc', direction='max'),\n",
    "        num_initial_points=4,\n",
    "        max_trials=20))\n",
    "\n",
    "FOLDS = 5\n",
    "SEED = 42\n",
    "#如果是训练的话运行这里\n",
    "if TRAINING:\n",
    "    gkf = PurgedGroupTimeSeriesSplit(n_splits = FOLDS, group_gap=20)\n",
    "    splits = list(gkf.split(y, groups=train['date'].values))\n",
    "    tuner.search((X,),(y,),splits=splits,batch_size=4096,epochs=100,callbacks=[EarlyStopping('val_auc', mode='max',patience=3)])\n",
    "    hp  = tuner.get_best_hyperparameters(1)[0]\n",
    "    pd.to_pickle(hp,f'./best_hp_{SEED}.pkl')\n",
    "    for fold, (train_indices, test_indices) in enumerate(splits):\n",
    "        model = model_fn(hp)\n",
    "        X_train, X_test = X[train_indices], X[test_indices]\n",
    "        y_train, y_test = y[train_indices], y[test_indices]\n",
    "        model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=100,batch_size=4096,callbacks=[EarlyStopping('val_auc',mode='max',patience=10,restore_best_weights=True)])\n",
    "        model.save_weights(f'./model_{SEED}_{fold}.hdf5')\n",
    "        model.compile(Adam(hp.get('lr')/100),loss='binary_crossentropy')\n",
    "        model.fit(X_test,y_test,epochs=3,batch_size=4096)\n",
    "        model.save_weights(f'./model_{SEED}_{fold}_finetune.hdf5')\n",
    "    tuner.results_summary()\n",
    "#如果是预测的话运行这里\n",
    "else:\n",
    "    models = []\n",
    "    hp = pd.read_pickle(f'../input/janestreetmodel0119/best_hp_{SEED}.pkl')\n",
    "    for f in range(FOLDS):\n",
    "        model = model_fn(hp)\n",
    "        if USE_FINETUNE:\n",
    "            model.load_weights(f'../input/janestreetmodel0119/model_{SEED}_{f}_finetune.hdf5')\n",
    "        else:\n",
    "            model.load_weights(f'../input/janestreetmodel0119/model_{SEED}_{f}.hdf5')\n",
    "        models.append(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-08T07:19:30.42412Z",
     "start_time": "2021-01-08T07:06:25.315363Z"
    },
    "execution": {
     "iopub.execute_input": "2021-01-20T00:12:39.398986Z",
     "iopub.status.busy": "2021-01-20T00:12:39.398267Z",
     "iopub.status.idle": "2021-01-20T00:12:39.405199Z",
     "shell.execute_reply": "2021-01-20T00:12:39.404568Z"
    },
    "papermill": {
     "duration": 0.031845,
     "end_time": "2021-01-20T00:12:39.405298",
     "exception": false,
     "start_time": "2021-01-20T00:12:39.373453",
     "status": "completed"
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"\\n# 训练与预测\\nFOLDS = 4\\nSEED = 2021\\noof = np.zeros((X.shape[0],5))\\n#如果是训练的话运行这里\\nif TRAINING:\\n    gkf = PurgedGroupTimeSeriesSplit(n_splits = FOLDS, group_gap=20)\\n    splits = list(gkf.split(y, groups=train['date'].values))\\n\\n    for fold, (train_indices, test_indices) in enumerate(splits):\\n        model = create_model(130, 5, encoder)\\n        X_train, X_test = X[train_indices], X[test_indices]\\n        y_train, y_test = y[train_indices], y[test_indices]\\n        # 现在训练集上做训练，然后在测试集上做微调\\n        model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=100,batch_size=4096,callbacks=[EarlyStopping('val_auc',mode='max',patience=10,restore_best_weights=True)])\\n        model.save_weights(f'./model_{SEED}_{fold}.hdf5')\\n        model.compile(Adam(0.00001),loss='binary_crossentropy')\\n        model.fit(X_test,y_test,epochs=3,batch_size=4096)\\n        model.save_weights(f'./model_{SEED}_{fold}_finetune.hdf5')\\n#如果是预测的话运行这里\\nelse:\\n    models = []\\n    for f in range(FOLDS):\\n        model = create_model(130, 5, encoder)\\n        if USE_FINETUNE:\\n            model.load_weights(f'../input/janestreetmodels/model_{SEED}_{f}_finetune.hdf5')\\n        else:\\n            model.load_weights(f'../input/janestreetmodels/model_{SEED}_{f}.hdf5')\\n        models.append(model)\\n\""
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''\n",
    "# 训练与预测\n",
    "FOLDS = 4\n",
    "SEED = 2021\n",
    "oof = np.zeros((X.shape[0],5))\n",
    "#如果是训练的话运行这里\n",
    "if TRAINING:\n",
    "    gkf = PurgedGroupTimeSeriesSplit(n_splits = FOLDS, group_gap=20)\n",
    "    splits = list(gkf.split(y, groups=train['date'].values))\n",
    "\n",
    "    for fold, (train_indices, test_indices) in enumerate(splits):\n",
    "        model = create_model(130, 5, encoder)\n",
    "        X_train, X_test = X[train_indices], X[test_indices]\n",
    "        y_train, y_test = y[train_indices], y[test_indices]\n",
    "        # 现在训练集上做训练，然后在测试集上做微调\n",
    "        model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=100,batch_size=4096,callbacks=[EarlyStopping('val_auc',mode='max',patience=10,restore_best_weights=True)])\n",
    "        model.save_weights(f'./model_{SEED}_{fold}.hdf5')\n",
    "        model.compile(Adam(0.00001),loss='binary_crossentropy')\n",
    "        model.fit(X_test,y_test,epochs=3,batch_size=4096)\n",
    "        model.save_weights(f'./model_{SEED}_{fold}_finetune.hdf5')\n",
    "#如果是预测的话运行这里\n",
    "else:\n",
    "    models = []\n",
    "    for f in range(FOLDS):\n",
    "        model = create_model(130, 5, encoder)\n",
    "        if USE_FINETUNE:\n",
    "            model.load_weights(f'../input/janestreetmodels/model_{SEED}_{f}_finetune.hdf5')\n",
    "        else:\n",
    "            model.load_weights(f'../input/janestreetmodels/model_{SEED}_{f}.hdf5')\n",
    "        models.append(model)\n",
    "'''"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.019369,
     "end_time": "2021-01-20T00:12:39.443276",
     "exception": false,
     "start_time": "2021-01-20T00:12:39.423907",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-01-08T07:25:17.812156Z",
     "start_time": "2021-01-08T07:25:17.797935Z"
    },
    "execution": {
     "iopub.execute_input": "2021-01-20T00:12:39.493925Z",
     "iopub.status.busy": "2021-01-20T00:12:39.493206Z",
     "iopub.status.idle": "2021-01-20T00:18:48.382636Z",
     "shell.execute_reply": "2021-01-20T00:18:48.383298Z"
    },
    "papermill": {
     "duration": 368.921416,
     "end_time": "2021-01-20T00:18:48.383475",
     "exception": false,
     "start_time": "2021-01-20T00:12:39.462059",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "15219it [06:08, 41.26it/s]\n"
     ]
    }
   ],
   "source": [
    "#提交，TRAINING = FALSE才行\n",
    "if not TRAINING:\n",
    "    f = np.median # 中位数\n",
    "    models = models[-2:]\n",
    "    import janestreet\n",
    "    env = janestreet.make_env()\n",
    "    th = 0.5 #0.503\n",
    "    for (test_df, pred_df) in tqdm(env.iter_test()):\n",
    "        if test_df['weight'].item() > 0:\n",
    "            x_tt = test_df.loc[:, features].values\n",
    "            if np.isnan(x_tt[:, 1:].sum()):\n",
    "                # 缺失值的填充\n",
    "                x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean\n",
    "            # 多个模型的均值 \n",
    "            pred = np.mean([model(x_tt, training = False).numpy() for model in models],axis=0)\n",
    "            pred = f(pred)\n",
    "            pred_df.action = np.where(pred >= th, 1, 0).astype(int)\n",
    "        else:\n",
    "            pred_df.action = 0\n",
    "        env.predict(pred_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "papermill": {
     "duration": 1.193428,
     "end_time": "2021-01-20T00:18:50.741398",
     "exception": false,
     "start_time": "2021-01-20T00:18:49.547970",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "papermill": {
   "duration": 470.060888,
   "end_time": "2021-01-20T00:18:53.354456",
   "environment_variables": {},
   "exception": null,
   "input_path": "__notebook__.ipynb",
   "output_path": "__notebook__.ipynb",
   "parameters": {},
   "start_time": "2021-01-20T00:11:03.293568",
   "version": "2.1.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
